In [7]:
import numpy as np
import csv
import os
import sys

# Location of the dataset, relative to the directory the notebook runs in.
# os.path.join is used so the separator is correct on every platform.
data_filename = os.path.join(os.getcwd(), "data", "ionosphere.data")

# Dimensions of the pre-allocated arrays filled by the loading cell:
# one row per sample, one column per numeric feature.
N_SAMPLES = 351
N_FEATURES = 34

X = np.zeros((N_SAMPLES, N_FEATURES), dtype='float')  # feature matrix
Y = np.zeros((N_SAMPLES,), dtype='bool')  # class labels (filled in as True where the last CSV column is 'g')

In [11]:
# Fill the pre-allocated X / Y arrays from the CSV file, one sample per row.
# newline='' is the mode the csv module documents for files handed to csv.reader.
with open(data_filename, 'r', newline='') as input_file:
    # csv.reader yields [] for blank lines (e.g. a trailing empty line);
    # drop those before numbering so row indices stay contiguous.
    rows = (row for row in csv.reader(input_file) if row)
    for i, row in enumerate(rows):
        # Every column except the last is a numeric feature.
        X[i] = [float(datum) for datum in row[:-1]]
        # The last column is the class: 'g' -> True, anything else -> False.
        Y[i] = row[-1] == 'g'

print(X[1,2])


1.0

In [12]:
# sklearn.cross_validation was removed from scikit-learn (0.20);
# train_test_split now lives in sklearn.model_selection.
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Split into training and test sets; the fixed random_state makes the split repeatable.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,random_state=14)

# Create the k-nearest-neighbours classifier (uses 5 neighbours by default).
estimator = KNeighborsClassifier()
# Fit on the training set.
estimator.fit(X_train,Y_train)
# Predict the labels of the held-out test set.
Y_predicted = estimator.predict(X_test)

# Accuracy = percentage of test samples whose predicted label matches the true one.
accuracy = np.mean(Y_test == Y_predicted) * 100
print("The accuracy is {0:.1f}%".format(accuracy))


The accuracy is 86.4%

scikit-learn 提供了几种交叉验证方法

1、cross_val_score 默认使用 Stratified K-Fold 方法切分数据


In [13]:
# sklearn.cross_validation was removed from scikit-learn (0.20);
# cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score

estimator = KNeighborsClassifier()  # uses the 5 nearest neighbours by default
# The fold count is pinned explicitly: the removed cross_validation API defaulted
# to 3 folds, while newer scikit-learn releases default to 5. An integer cv with a
# classifier splits the data with stratified K-fold.
scores = cross_val_score(estimator, X, Y, scoring='accuracy', cv=3)
average_accuracy = np.mean(scores) * 100
print("The average accuracy is {0:.1f}%".format(average_accuracy))


The average accuracy is 82.3%

n_neighbors 取不同值时对应的结果

%matplotlib inline


In [19]:
from matplotlib import pyplot as plt

# Sweep n_neighbors and record the cross-validated accuracy for each value.
avg_scores = []  # mean accuracy per n_neighbors value
all_scores = []  # per-fold accuracies per n_neighbors value
parameter_values = list(range(1, 21))  # 1 through 20 inclusive
for n_neighbors in parameter_values:
    estimator = KNeighborsClassifier(n_neighbors=n_neighbors)
    # Fold count pinned explicitly so the curve does not depend on the
    # installed scikit-learn version's default.
    scores = cross_val_score(estimator, X, Y, scoring='accuracy', cv=3)
    avg_scores.append(np.mean(scores))
    all_scores.append(scores)

# Plot mean accuracy against n_neighbors; labels let the figure stand on its own.
fig, ax = plt.subplots()
ax.plot(parameter_values, avg_scores, '-o')
ax.set_xlabel("n_neighbors")
ax.set_ylabel("Mean cross-validated accuracy")
ax.set_title("KNN accuracy vs. n_neighbors")
plt.show()



In [ ]: